### Optional script to remake analysis data from scratch
###
### Must be run in R 3.6.1, use Dockerfile included in replication archive

date()
library("tidyverse")
library("Amelia")
library("assertr")
library("countrycode")
library("foreach")
library("geosphere")
library("haven")
library("lubridate")
library("purrr")
library("WDI")
sessionInfo()

## Load datasets
raw_nmc <- read_csv("NMC_5_0.csv", na = "-9")
raw_states <- read_csv("states2016.csv")
raw_polity <- read_spss("p4v2017.sav")
raw_maddison <- read_stata("mpd2018.dta")
raw_pwt <- read_stata("pwt90.dta")
raw_wdi <- read_csv("wdi-download.csv")
raw_majors <- read_csv("majors2016.csv")
raw_latlong <- read_tsv("LatLong2011.raw",
                        col_names = c("ccode",
                                      "cabbr",
                                      "city",
                                      "is_main",
                                      "lat_deg",
                                      "lat_min",
                                      "lat_dir",
                                      "long_deg",
                                      "long_min",
                                      "long_dir",
                                      "year_min",
                                      "year_max",
                                      "country"),
                        skip = 1)
raw_atop <- read_csv("atop4_01dy.csv",
                     col_types = cols_only(year = col_double(),
                                           mem1 = col_double(),
                                           mem2 = col_double(),
                                           defense = col_integer(),
                                           offense = col_integer(),
                                           neutral = col_integer(),
                                           nonagg = col_integer(),
                                           consul = col_integer()))
raw_atop_s <- read_csv("atop_sscores.csv")
raw_contig <- read_csv("contdir.csv")
raw_mid_a <- read_csv("gml-mida-2.1.1.csv",
                      col_types = cols(
                          link1 = col_character(),
                          link2 = col_character(),
                          link3 = col_character(),
                          dispnum4 = col_integer()
                      ),
                      na = c("-9", "NA"))
raw_mid_b <- read_csv("gml-midb-2.1.1.csv",
                      col_types = cols(
                          dispnum4 = col_integer()
                      ),
                      na = c("-9", "NA"))
raw_mid_dyad <- read_csv("gml-ndy-disputes-2.1.1.csv",
                         col_types = cols(
                             revtype21 = col_integer(),
                             revtype22 = col_integer()
                         ),
                         na = c("-9", "NA"))
raw_mid_loc <- read_csv("MIDLOCA_2.0.csv")
raw_werner <- read_csv("wernerpeaceyears.csv")


## ----------------------------------------------------------------------
## Create state-year base from National Material Capabilities data
## ----------------------------------------------------------------------

## Remove extraneous version variable and validate
data_state_year <- raw_nmc %>%
    select(-version) %>%
    assert(not_na, ccode, stateabb, year)

## Merge in readable country names
data_states <- raw_states %>%
    group_by(ccode) %>%
    summarise(statenme = unique(statenme))
data_state_year <- data_state_year %>%
    left_join(data_states, by = "ccode") %>%
    select(ccode, stateabb, statenme, year, everything())


## ----------------------------------------------------------------------
## Merge in Polity data
## ----------------------------------------------------------------------

## Clean Polity data:
##
## * Convert -66 etc to missing
## * Remove everything but the polity score and the ID variables
## * Reduce to same temporal coverage as current NMC data (1816-2012)
data_polity <- raw_polity %>%
    select(ccode, year, country, polity2) %>%
    mutate(polity2 = if_else(polity2 %in% c(-66, -77, -88),
                             NA_real_,
                             polity2)) %>%
    filter(year >= min(data_state_year$year),
           year <= max(data_state_year$year)) %>%
    assert(in_set(seq(-10, 10, by = 1)), polity2) %>%
    assert(not_na, ccode:country)

## Fix country codes
##
## Polity data supposedly uses COW codes but in reality there's some differences
## between them
countries_nmc <- sort(unique(data_state_year$ccode))
countries_polity <- sort(unique(data_polity$ccode))
polity_minus_nmc <- setdiff(countries_polity, countries_nmc)
nmc_minus_polity <- setdiff(countries_nmc, countries_polity)

## 89: United Province CA (1824-1838)
##
## Consisted of component countries, with date of system entry per COW 2016:
##     Costa Rica (94) - 1920
##     El Salvador (92) - 1875
##     Guatemala (90) - 1868
##     Honduras (91) - 1899
##     Nicaragua (93) - 1900
##
## None of these remotely overlap with United Province CA, so nothing to do in
## terms of data merging

## 99: Gran Colombia (1821-1832)
##
## Treat as Colombia (100), using the Colombia coding for the year of overlap
data_polity <- data_polity %>%
    verify(max(year[ccode == 99]) == 1832) %>%
    verify(min(year[ccode == 100]) == 1832) %>%
    filter(ccode != 99 | year != 1832) %>%
    mutate(ccode = if_else(ccode == 99, 100, ccode))

## 324: Sardinia (1816-1861)
##
## Follow EUGene in merging this coding with Italy (325), using the Italy coding
## for the year of overlap
data_polity <- data_polity %>%
    verify(max(year[ccode == 324]) == 1861) %>%
    verify(min(year[ccode == 325]) == 1861) %>%
    filter(ccode != 324 | year != 1861) %>%
    mutate(ccode = if_else(ccode == 324, 325, ccode))

## 342: Serbia (1830-2012)
##
## Follow COW in treating this the same as Yugoslavia (345), no overlap
##
## Also merging into 345 the scores for 347, which Polity uses for Yugoslavia
## and Serbia & Montenegro; use the 342 scores in the overlap years
data_polity <- data_polity %>%
    verify(length(intersect(year[ccode == 342], year[ccode == 345])) == 0) %>%
    mutate(ccode = if_else(ccode == 342, 345, ccode)) %>%
    verify(identical(sort(intersect(year[ccode == 345], year[ccode == 347])),
                     c(1991, 2006))) %>%
    filter(ccode != 347 | !(year %in% c(1991, 2006))) %>%
    mutate(ccode = if_else(ccode == 347, 345, ccode))

## 348: Montenegro (2006-2012)
##
##   Code   COW Country   Polity Country
##   ----   -----------   --------------
##   341    Montenegro    Kosovo
##   347    Kosovo        Yugoslavia/Serbia & Montenegro
##   348    --            Montenegro
data_polity <- data_polity %>%
    verify(all(ccode != 347)) %>%
    mutate(ccode = if_else(ccode == 341, 347, ccode),
           ccode = if_else(ccode == 348, 341, ccode))

## 364: USSR (1922-1991)
##
## Treat as Russia (365), using the Russia coding for overlap year 1922
data_polity <- data_polity %>%
    verify(identical(intersect(year[ccode == 364], year[ccode == 365]),
                     1922)) %>%
    filter(ccode != 364 | year != 1922) %>%
    mutate(ccode = if_else(ccode == 364, 365, ccode))

## 525: South Sudan (2011-2012)
##
##   Code   COW Country   Polity Country
##   ----   -----------   --------------
##   525    --            South Sudan
##   625    Sudan         Sudan
##   626    South Sudan   Sudan-North
##
## First merge 626 into 625, treating as Sudan for the overlap year 2011 (South
## Sudan became independent just over halfway into the year), then move 525 into
## 626
data_polity <- data_polity %>%
    verify(identical(intersect(year[ccode == 625], year[ccode == 626]),
                     2011)) %>%
    filter(ccode != 626 | year != 2011) %>%
    mutate(ccode = if_else(ccode == 626, 625, ccode),
           ccode = if_else(ccode == 525, 626, ccode))

## 529: Ethiopia (1993-2012)
##
## Merge with Ethiopia (530), using the more recent coding (529) for the overlap
## year 1993 since the Eritrean independence referendum occurred in the first
## half of the year
data_polity <- data_polity %>%
    verify(identical(intersect(year[ccode == 529], year[ccode == 530]),
                     1993)) %>%
    filter(ccode != 530 | year != 1993) %>%
    mutate(ccode = if_else(ccode == 529, 530, ccode))

## 564: Orange Free State (1854-1902)
##
## Not treated as a sovereign state by COW, so ignore

## 769: Pakistan (1947-1971)
##
## Merge with Pakistan (770)
data_polity <- data_polity %>%
    verify(length(intersect(year[ccode == 769], year[ccode == 770])) == 0) %>%
    mutate(ccode = if_else(ccode == 769, 770, ccode))

## 818: Vietnam (1976-2012)
##
## Merge with Vietnam (816), using the more recent coding (818) for the overlap
## year 1976 since the reunification happened juuuust barely in the second half
## of the year
data_polity <- data_polity %>%
    verify(identical(intersect(year[ccode == 816], year[ccode == 818]),
                     1976)) %>%
    filter(ccode != 816 | year != 1976) %>%
    mutate(ccode = if_else(ccode == 818, 816, ccode))

## Too small to be tracked by Polity
##
## 31: Bahamas (1973-2012)
## 53: Barbados (1966-2012)
## 54: Dominica (1978-2012)
## 55: Grenada (1974-2012)
## 56: St. Lucia (1979-2012)
## 57: St. Vincent and the Grenadines (1979-2012)
## 58: Antigua & Barbuda (1981-2012)
## 60: St. Kitts and Nevis (1983-2012)
## 80: Belize (1981-2012)
## 221: Monaco (1993-2012)
## 223: Liechtenstein (1990-2012)
## 232: Andorra (1993-2012)
## 240: Hanover (1837-1866)
## 273: Hesse Electoral (1816-1866)
## 275: Hesse Grand Ducal (1816-1867)
## 280: Mecklenburg Schwerin (1843-1867)
## 331: San Marino (1992-2012)
## 338: Malta (1964-2012)
## 395: Iceland (1944-2012)
## 403: Sao Tome and Principe (1975-2012)
## 511: Zanzibar (1963-1964)
## 591: Seychelles (1976-2012)
## 781: Maldives (1965-2012)
## 835: Brunei (1984-2012)
## 935: Vanuatu (1981-2012)
## 946: Kiribati (1999-2012)
## 947: Tuvalu (2000-2012)
## 955: Tonga (1999-2012)
## 970: Nauru (1999-2012)
## 983: Marshall Islands (1991-2012)
## 986: Palau (1994-2012)
## 987: Federated States of Micronesia (1991-2012)
## 990: Samoa (1976-2012)

## 300: Austria-Hungary (1816-1918)
##
## Treat as Austria (305)
auh_years <- data_state_year %>%
    filter(ccode == 300) %>%
    pull(year)
data_polity <- rbind(data_polity,
                     data_polity %>%
                     filter(year %in% auh_years,
                            ccode == 305) %>%
                     mutate(ccode = 300))

## Only keep identifiers and the Polity2 score, double-check no duplicates
data_polity <- data_polity %>%
    select(ccode, year, polity2) %>%
    verify(!duplicated(paste0(ccode, ".", year)))

data_state_year <- data_state_year %>%
    left_join(data_polity, by = c("ccode", "year"))


## ----------------------------------------------------------------------
## Nuclear capability, coded as in Gartzke and Kroenig 2009 --- criterion is that the
## state must have a deliverable nuclear weapon
## ----------------------------------------------------------------------

## Set up data frame
data_nuclear <- tibble(ccode = numeric(0), year = numeric(0))
now <- max(data_state_year$year)

## USA (2): 1945-present
## Russia (365): 1949-present
## United Kingdom (200): 1952-present
## France (220): 1960-present
## China (710): 1964-present
## Israel (666): 1967-present
## India (750): 1988-present
## South Africa (560): 1982-1990
## Pakistan (770): 1990-present
data_nuclear <- data_nuclear %>%
    add_row(ccode = 2, year = 1945:now) %>%
    add_row(ccode = 365, year = 1949:now) %>%
    add_row(ccode = 200, year = 1952:now) %>%
    add_row(ccode = 220, year = 1960:now) %>%
    add_row(ccode = 710, year = 1964:now) %>%
    add_row(ccode = 666, year = 1967:now) %>%
    add_row(ccode = 750, year = 1988:now) %>%
    add_row(ccode = 560, year = 1982:1990) %>%
    add_row(ccode = 770, year = 1990:now)
data_nuclear$nuclear <- 1

## Merge into state-year data
data_state_year <- data_state_year %>%
    left_join(data_nuclear, by = c("ccode", "year")) %>%
    mutate(nuclear = if_else(is.na(nuclear), 0, nuclear)) %>%
    assert(in_set(0:1, allow.na = FALSE), nuclear)


## ----------------------------------------------------------------------
## Maddison (1AD-2016) GDP data
## ----------------------------------------------------------------------

## Maddison data appears to use ISO3 country codes -- will try to merge them
## into the COW-based data with the countrycode package's conversion
data_iso <- countrycode::codelist_panel %>%
    as_tibble() %>%
    filter(!is.na(iso3c)) %>%
    select(ccode = cown, year, iso3c) %>%
    mutate(ccode = as.numeric(ccode),
           year = as.numeric(year))

data_maddison <- raw_maddison %>%
    filter(year >= min(data_state_year$year),
           year <= max(data_state_year$year)) %>%
    left_join(data_iso,
              by = c("countrycode" = "iso3c", "year" = "year")) %>%
    select(countrycode, country, ccode, year, everything()) %>%
    verify(is.na(ccode) | !duplicated(paste0(ccode, ".", year)))

## For merging with COW data, we can ignore Hong Kong, Puerto Rico, and
## Palestine.  Some caution required with the other ones though.
data_maddison <- data_maddison %>%
    filter(!(country %in% c("China, Hong Kong SAR", "Puerto Rico", "State of Palestine")))

## Czechoslovakia: Treated as separate from Czech Republic by COW
data_maddison <- data_maddison %>%
    mutate(ccode = if_else(country == "Czechoslovakia", 315, ccode))

## Former USSR: Want to use this rather than the "Russian Federation" coding for
## the years when Russia was the USSR
data_maddison <- data_maddison %>%
    filter(country != "Russian Federation" | year < 1923 | year > 1991) %>%
    filter(country != "Former USSR" | year %in% 1923:1991) %>%
    mutate(ccode = if_else(country %in% c("Russian Federation", "Former USSR"), 365, ccode))

## Former Yugoslavia: Want to use this rather than the "Serbia" coding for the
## years when Serbia was Yugoslavia
data_maddison <- data_maddison %>%
    filter(country != "Serbia" | year < 1918 | year > 2006) %>%
    filter(country != "Former Yugoslavia" | (year >= 1918 & year <= 2006)) %>%
    mutate(ccode = if_else(country %in% c("Serbia", "Former Yugoslavia"), 345, ccode))

## Carry forward/back for other missingness, verifying that each series has at
## least one non-NA ccode
data_maddison <- data_maddison %>%
    group_by(countrycode) %>%
    mutate(year_min = min(year[!is.na(ccode)]),
           year_max = max(year[!is.na(ccode)])) %>%
    assert(not_na, year_min, year_max) %>%
    mutate(ccode_min = ccode[year == year_min],
           ccode_max = ccode[year == year_max],
           ccode = if_else(is.na(ccode) & year < year_min, ccode_min, ccode),
           ccode = if_else(is.na(ccode) & year > year_max, ccode_max, ccode)) %>%
    ungroup()

## For cases where missingness is in a "gap" rather than at the edges, replace
## with the edge codes if both are equal
data_maddison <- data_maddison %>%
    mutate(ccode = if_else(is.na(ccode) & ccode_min == ccode_max, ccode_min, ccode))

## At this point all that should be left are some observations from Korea under
## Japanese occupation
data_maddison <- data_maddison %>%
    verify(!is.na(ccode) | (countrycode == "KOR" & year < 1949)) %>%
    mutate(ccode = if_else(is.na(ccode), 730, ccode))

## Reduce to the variables we care about: both of their measures of GDP per
## capita (as we'll be imputing into the Penn World Tables anyway), plus their
## estimate of population
data_maddison <- data_maddison %>%
    select(ccode, year,
           cgdppc_madd = cgdppc,
           rdgppc_madd = rgdpnapc,
           pop_madd = pop) %>%
    verify(!duplicated(paste0(ccode, ".", year)))

## Merge and validate
data_state_year <- data_state_year %>%
    left_join(data_maddison, by = c("ccode", "year")) %>%
    verify(!duplicated(paste0(ccode, ".", year))) %>%
    assert(not_na, ccode, stateabb, statenme, year)


## ----------------------------------------------------------------------
## Penn World Tables GDP data
## ----------------------------------------------------------------------

## Like the Maddison data, Penn World Tables uses ISO3 country codes, so need to perform a similar conversion
data_iso <- countrycode::codelist_panel %>%
    as_tibble() %>%
    filter(!is.na(iso3c)) %>%
    select(ccode = cown, year, iso3c) %>%
    mutate(ccode = as.numeric(ccode),
           year = as.numeric(year))

data_pwt <- raw_pwt %>%
    filter(year >= min(data_state_year$year),
           year <= max(data_state_year$year)) %>%
    left_join(data_iso,
              by = c("countrycode" = "iso3c", "year" = "year")) %>%
    select(countrycode, country, ccode, year, everything())

## First find series where there aren't country codes for any observation so
## they can be ruled out or mass-corrected
to_exclude <- data_pwt %>%
    group_by(country) %>%
    summarise(all_bad = all(is.na(ccode))) %>%
    filter(all_bad) %>%
    pull(country)

## # A tibble: 13 x 2
##    country                   all_bad
##    <chr>                     <lgl>
##  1 Anguilla                  TRUE
##  2 Aruba                     TRUE
##  3 Bermuda                   TRUE
##  4 British Virgin Islands    TRUE
##  5 Cayman Islands            TRUE
##  6 China, Hong Kong SAR      TRUE
##  7 China, Macao SAR          TRUE
##  8 Curaçao                   TRUE
##  9 Montserrat                TRUE
## 10 Serbia                    TRUE
## 11 Sint Maarten (Dutch part) TRUE
## 12 State of Palestine        TRUE
## 13 Turks and Caicos Islands  TRUE

## Exclude the ones that aren't included in COW, which is all of them but Serbia
to_exclude <- setdiff(to_exclude, "Serbia")
data_pwt <- data_pwt %>%
    filter(!(country %in% to_exclude))

## No Yugoslavia coded here, so we can just recode Serbia as Yugoslavia
## following the Correlates of War treatment
data_pwt <- data_pwt %>%
    mutate(ccode = if_else(country == "Serbia", 345, ccode))

## Carry forward/back for other missingness, verifying that each series has at
## least one non-missing code
data_pwt <- data_pwt %>%
    group_by(countrycode) %>%
    mutate(year_min = min(year[!is.na(ccode)]),
           year_max = max(year[!is.na(ccode)])) %>%
    assert(not_na, year_min, year_max) %>%
    mutate(ccode_min = ccode[year == year_min],
           ccode_max = ccode[year == year_max],
           ccode = if_else(is.na(ccode) & year < year_min, ccode_min, ccode),
           ccode = if_else(is.na(ccode) & year > year_max, ccode_max, ccode)) %>%
    ungroup()

## Only missingness left at this point should be a couple years of Syria, and
## there's no ambiguity in the countrycode there
data_pwt <- data_pwt %>%
    verify(!is.na(ccode) | countrycode == "SYR") %>%
    mutate(ccode = if_else(is.na(ccode) & countrycode == "SYR", 652, ccode)) %>%
    assert(not_na, ccode)

## Select variables
##
## The preferred measure in our empirical analysis will be PWT's "rgdpo",
## relabeled by us as simply "gdp_pwt", which is best for comparisons of
## "Productive capacity across countries and across years"
##
## Ideally would keep in most of them for imputation, but the problem is that
## they all have lots of missingness and this really increases the computational
## burden of the imputation
data_pwt <- data_pwt %>%
    select(ccode, year, rgdpo, rgdpe, cgdpe, cgdpo, pop) %>%
    rename(gdp = rgdpo) %>%
    select(ccode, year, gdp, everything()) %>%
    verify(!duplicated(paste0(ccode, ".", year)))
names(data_pwt) <- if_else(names(data_pwt) %in% c("ccode", "year"),
                           names(data_pwt),
                           paste0(names(data_pwt), "_pwt"))

## Merge and validate
data_state_year <- data_state_year %>%
    left_join(data_pwt, by = c("ccode", "year")) %>%
    verify(!duplicated(paste0(ccode, ".", year))) %>%
    assert(not_na, ccode, stateabb, statenme, year)


## ----------------------------------------------------------------------
## World Development Indicators
## ----------------------------------------------------------------------

## Remove "countries" that are aggregates
aggregates <- WDI_data$country %>%
    as_tibble() %>%
    filter(region == "Aggregates") %>%
    pull(iso2c)
data_wdi <- raw_wdi %>%
    filter(!(iso2c %in% aggregates))

## Sanify variable names and remove irrelevant years
data_wdi <- data_wdi %>%
    rename(pop_wdi = SP.POP.TOTL,
           gdp_wdi = NY.GDP.MKTP.PP.KD,
           pct_imports = NE.IMP.GNFS.ZS) %>%
    filter(year >= min(data_state_year$year),
           year <= max(data_state_year$year))

## Convert country codes to COW, using similar procedure to the previous GDP
## estimates
data_iso <- countrycode::codelist_panel %>%
    as_tibble() %>%
    filter(!is.na(iso2c)) %>%
    select(ccode = cown, year, iso2c) %>%
    mutate(ccode = as.numeric(ccode),
           year = as.numeric(year))

data_wdi <- data_wdi %>%
    left_join(data_iso, by = c("iso2c", "year")) %>%
    select(iso2c, country, ccode, year, everything())

## First find series in which all country codes are missing
to_exclude <- data_wdi %>%
    group_by(country) %>%
    summarise(all_bad = all(is.na(ccode))) %>%
    filter(all_bad) %>%
    pull(country)

## # A tibble: 26 x 2
##    country                   all_bad
##    <chr>                     <lgl>
##  1 American Samoa            TRUE
##  2 Aruba                     TRUE
##  3 Bermuda                   TRUE
##  4 British Virgin Islands    TRUE
##  5 Cayman Islands            TRUE
##  6 Channel Islands           TRUE
##  7 Curacao                   TRUE
##  8 Faroe Islands             TRUE
##  9 French Polynesia          TRUE
## 10 Gibraltar                 TRUE
## 11 Greenland                 TRUE
## 12 Guam                      TRUE
## 13 Hong Kong SAR, China      TRUE
## 14 Isle of Man               TRUE
## 15 Kosovo                    TRUE   (*)
## 16 Macao SAR, China          TRUE
## 17 Namibia                   TRUE   (*)
## 18 New Caledonia             TRUE
## 19 Northern Mariana Islands  TRUE
## 20 Puerto Rico               TRUE
## 21 Serbia                    TRUE   (*)
## 22 Sint Maarten (Dutch part) TRUE
## 23 St. Martin (French part)  TRUE
## 24 Turks and Caicos Islands  TRUE
## 25 Virgin Islands (U.S.)     TRUE
## 26 West Bank and Gaza        TRUE
##
## (*) = included in COW
to_exclude <- setdiff(to_exclude, c("Kosovo", "Namibia", "Serbia"))
data_wdi <- data_wdi %>%
    filter(!(country %in% to_exclude))

## No Yugoslavia coded here, so just recode Serbia as Yugoslavia per COW
data_wdi <- data_wdi %>%
    verify(is.na(ccode) | ccode != 345) %>%
    mutate(ccode = if_else(country == "Serbia", 345, ccode))

## Not sure why Kosovo is excluded
data_wdi <- data_wdi %>%
    verify(is.na(ccode) | ccode != 347) %>%
    mutate(ccode = if_else(country == "Kosovo", 347, ccode))

## Namibia seems to have been problematic since its 2-letter code is "NA"
data_wdi <- data_wdi %>%
    verify(is.na(ccode) | ccode != 565) %>%
    mutate(ccode = if_else(country == "Namibia", 565, ccode))

## Do the carry forward/back thing for others, this should kill all other
## missingness
data_wdi <- data_wdi %>%
    group_by(country) %>%
    mutate(year_min = min(year[!is.na(ccode)]),
           year_max = max(year[!is.na(ccode)])) %>%
    assert(not_na, year_min, year_max) %>%
    mutate(ccode_min = ccode[year == year_min],
           ccode_max = ccode[year == year_max],
           ccode = if_else(is.na(ccode) & year < year_min, ccode_min, ccode),
           ccode = if_else(is.na(ccode) & year > year_max, ccode_max, ccode)) %>%
    ungroup() %>%
    verify(!is.na(ccode))

## Only keep the information we need, and rescale the population and GDP figures
## to be in millions
data_wdi <- data_wdi %>%
    select(ccode, year, pop_wdi, gdp_wdi, pct_imports) %>%
    mutate(pop_wdi = pop_wdi / 10^6,
           gdp_wdi = gdp_wdi / 10^6) %>%
    verify(!duplicated(paste0(ccode, ".", year)))

## Merge and validate
data_state_year <- data_state_year %>%
    left_join(data_wdi, by = c("ccode", "year")) %>%
    verify(!duplicated(paste0(ccode, ".", year))) %>%
    assert(not_na, ccode, stateabb, statenme, year)


## ----------------------------------------------------------------------
## Major power status, via COW
## ----------------------------------------------------------------------

## Code major power state-years
data_majors <- raw_majors %>%
    rowwise() %>%
    do(tibble(ccode = .$ccode,
              year = seq(.$styear, .$endyear, by = 1))) %>%
    add_column(majpow = 1) %>%
    verify(!duplicated(paste0(ccode, ".", year)))

## Merge into state-year data
data_state_year <- data_state_year %>%
    left_join(data_majors, by = c("ccode", "year")) %>%
    mutate(majpow = if_else(is.na(majpow), 0, majpow)) %>%
    verify(!duplicated(paste0(ccode, ".", year))) %>%
    assert(in_set(0, 1, allow.na = FALSE), majpow) %>%
    assert(not_na, ccode, stateabb, statenme, year)


## ----------------------------------------------------------------------
## Latitude and longitude via EUGene
## ----------------------------------------------------------------------

## Remove non-capital cities and perform sanity checks
data_latlong <- raw_latlong %>%
    assert(in_set(0, 1), is_main) %>%
    filter(is_main == 1) %>%
    assert(within_bounds(0, 90), lat_deg) %>%
    assert(within_bounds(0, 180), long_deg) %>%
    assert(within_bounds(0, 60, include.upper = FALSE), lat_min, long_min) %>%
    assert(in_set("n", "s"), lat_dir) %>%
    assert(in_set("e", "w"), long_dir)

## To avoid missingness:
## * Extend observations as of 2011 to 2012
## * Extend Kosovo (347) observations backward to 2008
## * Add data for South Sudan (626)
data_latlong <- data_latlong %>%
    mutate(year_max = if_else(year_max == 2011, 2012, year_max)) %>%
    mutate(year_min = if_else(ccode == 347, 2008, year_min)) %>%
    add_row(ccode = 626,
            is_main = 1,
            lat_deg = 4,
            lat_min = 51,
            lat_dir = "n",
            long_deg = 31,
            long_min = 35,
            long_dir = "e",
            year_min = 2011,
            year_max = 2012)

## Convert latitudes and longitudes to single numbers
data_latlong <- data_latlong %>%
    mutate(latitude = lat_deg + (lat_min / 60),
           latitude = if_else(lat_dir == "s", -1 * latitude, latitude),
           longitude = long_deg + (long_min / 60),
           longitude = if_else(long_dir == "w", -1 * longitude, longitude)) %>%
    assert(within_bounds(-90, 90), latitude) %>%
    assert(within_bounds(-180, 180), longitude) %>%
    select(ccode, latitude, longitude, year_min, year_max)

## Convert to state-year format
data_latlong <- data_latlong %>%
    mutate(year = map2(year_min, year_max, seq)) %>%
    unnest(year) %>%
    select(ccode, year, latitude, longitude) %>%
    verify(!duplicated(paste(ccode, year)))

## Merge into main state-year data
data_state_year <- data_state_year %>%
    left_join(data_latlong, by = c("ccode", "year")) %>%
    assert(not_na, latitude, longitude)


## -----------------------------------------------------------------------------
## Prepare state-year data for imputation
## -----------------------------------------------------------------------------

data_to_impute <- as.data.frame(data_state_year)

## Basic information about each variable
data_vars <- data_to_impute %>%
    assert(not_na, ccode, stateabb, statenme, year) %>%
    select(-ccode, -stateabb, -statenme, -year) %>%
    gather(key = variable, value = value)

data_summ <- data_vars %>%
    group_by(variable) %>%
    summarise(pct_missing = mean(is.na(value)),
              min = min(value, na.rm = TRUE),
              mean = mean(value, na.rm = TRUE),
              med = median(value, na.rm = TRUE),
              max = max(value, na.rm = TRUE),
              sd = sd(value, na.rm = TRUE),
              binary = all(is.na(value) | value %in% c(0, 1)))

## Candiates for log transformation
to_log <- with(data_summ, variable[min > 0])

## Candidates for asinh transformation (mostly NMC variables)
to_asinh <- with(data_summ, variable[!binary & min == 0])

## Scale the transformation to best normalize the non-missing values of
## each of these variables
ks_stat <- function(x) {
    x <- x[!is.na(x)]
    ans <- ecdf(x)(x) - pnorm(x, mean = mean(x), sd = sd(x))
    max(abs(ans))
}

## Need to remember the transformation, since this isn't done within Amelia and
## we'll want to undo it afterward
thetas <- 2^seq(-10, 0, by = 0.5)
to_asinh_scales <- foreach (component = to_asinh, .combine = "c") %do% {
    x <- data_to_impute[[component]]
    loss <- foreach (s = thetas, .combine = "c") %do% {
        ks_stat(asinh(s * x))
    }
    thetas[which.min(loss)]
}
names(to_asinh_scales) <- to_asinh

## Apply transformations
for (x in to_asinh) {
    data_to_impute[[x]] <- asinh(to_asinh_scales[x] * data_to_impute[[x]])
}

## Make sure the imputed data doesn't violate the boundary conditions on these
bd <- matrix(NA_real_, 0, 3)
for (x in to_asinh) {
    idx <- which(names(data_to_impute) == x)
    bd <- rbind(bd, c(idx, 0, Inf))
}
bd <- rbind(bd, c(which(names(data_to_impute) == "polity2"), -10, 10))
for (i in seq_len(nrow(bd))) {
    stopifnot(all(data_to_impute[, bd[i, 1]] >= bd[i, 2], na.rm = TRUE))
    stopifnot(all(data_to_impute[, bd[i, 1]] <= bd[i, 3], na.rm = TRUE))
}

## Confirm that we don't need to worry about imputing any of the binary
## variables (nuclear and major power)
stopifnot(with(data_summ, all(pct_missing[binary] == 0.0)))

## id variables to leave out of the imputation altogether (need to leave in one
## country identifier to use cross-sectional imputation functionality)
##
## also leaving out latitude and longitude since it's unclear how to incorporate
## them properly into this linear-type model
to_exclude <- c("stateabb", "statenme", "latitude", "longitude")


## -----------------------------------------------------------------------------
## Impute
## -----------------------------------------------------------------------------

set.seed(1828)

imp_state_year <- amelia(data_to_impute,
                         m = 10,
                         p2s = 2,
                         idvars = to_exclude,
                         ts = "year",
                         cs = "ccode",
                         polytime = 1,
                         intercs = TRUE,
                         logs = to_log,
                         bounds = bd,
                         empri = 0.01 * nrow(data_to_impute))

## Extract the imputed datasets and untransform the asinh-transformed variables
imp_state_year <- imp_state_year$imputations
imp_state_year <- foreach (imp = imp_state_year) %do% {
    for (x in to_asinh) {
        imp[[x]] <- sinh(imp[[x]])
        imp[[x]] <- imp[[x]] / to_asinh_scales[x]
    }
    as_tibble(imp)
}


## ----------------------------------------------------------------------
## Add lags to imputed data
## ----------------------------------------------------------------------

imp_state_year <- foreach (imp = imp_state_year) %do% {
    ans <- imp %>%
        mutate(quality = if_else(milper > 0, milex / milper, 0))
    ans_lag <- ans %>%
        select(ccode, year, milex:cinc, quality) %>%
        mutate(year = year + 1)
    names(ans_lag) <- if_else(names(ans_lag) %in% c("ccode", "year"),
                              names(ans_lag),
                              paste0(names(ans_lag), "_lag_pure"))
    ans <- ans %>%
        left_join(ans_lag, by = c("ccode", "year"))

    ## When the lagged value is missing, use current year value (affects just 38
    ## cases of the final data, as confirmed in the final merge script)
    for (x in grep("_lag_pure$", names(ans), value = TRUE)) {
        x_orig <- gsub("_lag_pure$", "", x)
        x_lag <- sub("_lag_pure$", "_lag", x)
        ans[[x_lag]] <- if_else(is.na(ans[[x]]), ans[[x_orig]], ans[[x]])
    }

    ans %>%
        assert(not_na, -ends_with("_lag_pure"))
}


## ----------------------------------------------------------------------
## Create base (undirected) dyad-year dataset from state-year data
## ----------------------------------------------------------------------

## Calculate state pairs within each year
data_dyad_year <- data_state_year %>%
    group_by(year) %>%
    do(crossing(ccode_a = .$ccode,
                ccode_b = .$ccode)) %>%
    ungroup() %>%
    filter(ccode_a < ccode_b) %>%
    arrange(year, ccode_a, ccode_b)

## Merge in state abbreviations
data_states <- raw_states %>%
    group_by(ccode) %>%
    summarise(stateabb = unique(stateabb)) %>%
    verify(!duplicated(ccode))

data_dyad_year <- data_dyad_year %>%
    left_join(data_states %>% rename(ccode_a = ccode, stateabb_a = stateabb),
              by = "ccode_a") %>%
    left_join(data_states %>% rename(ccode_b = ccode, stateabb_b = stateabb),
              by = "ccode_b") %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year)))


## ----------------------------------------------------------------------
## Calculate S-scores from ATOP data
##
## Not just using the ones provided directly by ATOP because those have missing data in
## the S-scores, but we do check on the correlation between measures for the non-missing
## cases
## ----------------------------------------------------------------------


## Double-check that the universe of undirected dyads is the same and that the
## relevant variables are dichotomous
data_atop <- raw_atop %>%
    rename(ccode_a = mem1, ccode_b = mem2) %>%
    select(year, ccode_a, ccode_b, defense, offense, neutral, nonagg, consul) %>%
    filter(year >= !! min(data_dyad_year$year),
           year <= !! max(data_dyad_year$year)) %>%
    verify(ccode_a < ccode_b) %>%
    assert(in_set(0, 1, allow.na = FALSE),
           defense, offense, neutral, nonagg, consul) %>%
    assert(not_na, everything()) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year)))

## Follow Singer and Small coding scheme as closely as possible:
## 0 = no alliance
## 1 = entente (consultation pact)
## 2 = neutrality or nonaggression pact
## 3 = defense or offense pact
data_atop <- data_atop %>%
    mutate(alliance = case_when(
               defense == 1 | offense == 1 ~ 3,
               neutral == 1 | nonagg == 1 ~ 2,
               consul == 1 ~ 1,
               TRUE ~ 0
           ))

## Merge into list of dyads and code any missing values as having no alliance
data_alliances <- data_dyad_year %>%
    left_join(data_atop, by = c("ccode_a", "ccode_b", "year")) %>%
    mutate(alliance = if_else(is.na(alliance), 0, alliance)) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year)))


## Extract matrix of alliance ties from data frame of dyads (within a given
## year)
matrix_from_dyads <- function(dat) {
    a <- as.character(dat$ccode_a)
    b <- as.character(dat$ccode_b)

    ## Form storage matrix
    countries <- sort(unique(c(a, b)))
    n_countries <- length(countries)
    ans <- matrix(NA, nrow = n_countries, ncol = n_countries)
    rownames(ans) <- colnames(ans) <- countries

    ## Fill matrix
    ans[cbind(a, b)] <- dat$alliance
    ans[cbind(b, a)] <- dat$alliance
    diag(ans) <- 3

    ans
}

## Compute weighted S-score from matrix of alliance ties
##
## Assumes alliance ties are in {0, 1, 2, 3}
s_from_matrix <- function(mat, wt = rep(1, nrow(mat))) {
    stopifnot(all(mat %in% 0:3))

    ## Create a list containing each column of the matrix
    each_col <- split(mat, col(mat))

    ## Create a list of matrices of distances
    dist_mats <- lapply(each_col, function(x) abs(mat - x))

    ## Calculate weighted S from distance matrices
    s <- lapply(dist_mats, function(x) 1 - 2 * colSums(wt * x / 3) / sum(wt))
    s <- do.call(cbind, s)

    ## Return as data frame
    tibble(ccode_a = rownames(mat)[row(mat)],
           ccode_b = colnames(mat)[col(mat)],
           s_score = c(s)) %>%
        mutate_all(as.numeric)
}

## Calculate unweighted S
data_s_nowt <- data_alliances %>%
    group_by(year) %>%
    group_modify(~ {
        mat <- matrix_from_dyads(.x)
        s_from_matrix(mat)
    }) %>%
    rename(s_nowt = s_score)

## Calculate S weighted by CINC scores (0 weight for countries with no CINC
## score)
data_s_cinc <- data_alliances %>%
    group_by(year) %>%
    group_modify (~ {
        mat <- matrix_from_dyads(.x)
        yr <- unique(.y$year)
        countries <- as.numeric(rownames(mat))
        data_cinc <- filter(raw_nmc, year == !! yr)
        idx_countries <- match(countries, data_cinc$ccode)
        cinc <- data_cinc$cinc[idx_countries]
        cinc <- ifelse(is.na(cinc), 0, cinc)
        s_from_matrix(mat, wt = cinc)
    }) %>%
    rename(s_cinc = s_score)

## Merge into dyad-year base
data_dyad_year <- data_dyad_year %>%
    left_join(data_s_nowt, by = c("ccode_a", "ccode_b", "year")) %>%
    left_join(data_s_cinc, by = c("ccode_a", "ccode_b", "year")) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year))) %>%
    assert(not_na, everything())

## Check the correlation with the S-scores provided directly by the ATOP project
data_both_s <- left_join(raw_atop_s,
                         data_dyad_year,
                         by = c("ccode1" = "ccode_a",
                                "ccode2" = "ccode_b",
                                "year" = "year"))
cor_nowt <- with(data_both_s, cor(s_un_atop, s_nowt, use = "complete"))
cor_cinc <- with(data_both_s, cor(s_wt_atop, s_cinc, use = "complete"))
stopifnot(cor_nowt > 0.95)
stopifnot(cor_cinc > 0.93)


## ----------------------------------------------------------------------
## Contiguity
## ----------------------------------------------------------------------

## Clean the data: each entry in the original is a contiguity relationship,
## transform this into dyad-year entries
data_contig <- raw_contig %>%
    verify(statelno < statehno) %>%
    select(ccode_a = statelno,
           ccode_b = statehno,
           contig = conttype,
           begin, end) %>%
    mutate(begin = begin %/% 100,
           end = end %/% 100) %>%
    rowwise() %>%
    do(tibble(ccode_a = .$ccode_a,
              ccode_b = .$ccode_b,
              year = seq(.$begin, .$end, by = 1),
              contig = .$contig)) %>%
    ungroup() %>%
    group_by(ccode_a, ccode_b, year) %>%
    summarise(contig = min(contig))

## Merge into dyad-year base
data_dyad_year <- data_dyad_year %>%
    left_join(data_contig, by = c("ccode_a", "ccode_b", "year")) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year)))

## Criterion for binary contiguity measure: land border or separated by no more
## than 150 miles of water
data_dyad_year <- data_dyad_year %>%
    mutate(contig = if_else(!is.na(contig) & contig %in% 1:4, 1, 0)) %>%
    assert(not_na, everything())


## ----------------------------------------------------------------------
## Calculate dyadic peace years
## ----------------------------------------------------------------------

## Create data frame of MID incidence
##
## This data only goes until 2010, so the peace years for 2011-2012 will be off
## --- would need to fix that if imputing the dyadic data, but I don't think we
## are doing so
data_mid <- raw_mid_dyad %>%
    verify(ccode1 < ccode2) %>%
    assert(in_set(0, 1), sidea1, sidea2) %>%
    verify(sidea1 != sidea2) %>%
    select(ccode_a = ccode1, ccode_b = ccode2, year) %>%
    assert(not_na, everything()) %>%
    add_column(mid = 1)

## Eliminate duplicate observations
data_mid <- data_mid %>%
    group_by(ccode_a, ccode_b, year) %>%
    summarise(mid = max(mid))

## Merge into full data, calculating when MIDs aren't happening
data_dyad_year <- data_dyad_year %>%
    left_join(data_mid, by = c("ccode_a", "ccode_b", "year")) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year))) %>%
    mutate(mid = if_else(is.na(mid), 0, mid)) %>%
    assert(in_set(0, 1), mid) %>%
    assert(not_na, everything())

## For 1816 dyads, merge in Werner's corrected peace years
data_werner <- raw_werner %>%
    verify(ccode1 < ccode2) %>%
    verify(peaceyrs > 0) %>%
    verify(year == 1816) %>%
    rename(ccode_a = ccode1, ccode_b = ccode2, py = peaceyrs)

data_py <- data_dyad_year %>%
    left_join(data_werner, by = c("ccode_a", "ccode_b", "year")) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year))) %>%
    verify(year == 1816 | is.na(py)) %>%
    select(year, ccode_a, ccode_b, mid, py)

## Calculate two measures of peace years for a data frame containing
## observations from a dyad.  The first measure increments peace years by 1
## whenever there wasn't a MID in the previous year of observation, even if
## there was a gap in years.  The second measure resets to 0 whenever there is a
## gap.
calculate_py <- function(dat) {
    dat <- arrange(dat, year)
    if (is.na(dat$py[1]))
        dat$py[1] <- 0
    dat$py_alt <- dat$py

    if (nrow(dat) < 2)
        return(dat)

    for (i in seq(2, nrow(dat))) {
        if (dat$mid[i-1] == 1) {
            dat$py[i] <- dat$py_alt[i] <- 0
        } else {
            dat$py[i] <- dat$py[i-1] + 1
            dat$py_alt[i] <- if (dat$year[i-1] == dat$year[i] - 1) dat$py[i] else 0
        }
    }

    dat
}

data_py <- data_py %>%
    group_by(ccode_a, ccode_b) %>%
    do(calculate_py(.)) %>%
    ungroup() %>%
    select(-mid)

## Merge into main dyad-year data
data_dyad_year <- data_dyad_year %>%
    left_join(data_py, by = c("ccode_a", "ccode_b", "year")) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year))) %>%
    verify(py >= py_alt) %>%
    assert(within_bounds(0, Inf), py, py_alt) %>%
    assert(not_na, everything())


## ----------------------------------------------------------------------
## Calculate dyadic major power measures
## ----------------------------------------------------------------------

data_dyad_year <- data_dyad_year %>%
    left_join(data_state_year %>%
              select(ccode_a = ccode, year, majpow_a = majpow),
              by = c("ccode_a", "year")) %>%
    left_join(data_state_year %>%
              select(ccode_b = ccode, year, majpow_b = majpow),
              by = c("ccode_b", "year")) %>%
    assert(in_set(0:1), starts_with("majpow")) %>%
    mutate(mp_any = as.numeric(majpow_a + majpow_b > 0),
           mp_all = as.numeric(majpow_a + majpow_b == 2)) %>%
    verify(mp_all <= mp_any) %>%
    select(-starts_with("majpow")) %>%
    assert(not_na, everything()) %>%
    verify(!duplicated(paste0(ccode_a, ".", ccode_b, ".", year)))


## ----------------------------------------------------------------------
## Calculate joint democracy
##
## Because this is missing for at least one state in 112 of our disputes, we need to make
## imputed copies of the dyad-year data too
## ----------------------------------------------------------------------

imp_dyad_year <- foreach(imp = imp_state_year) %do% {
    data_dyad_year %>%
        left_join(imp %>% select(ccode_a = ccode, year, polity2_a = polity2),
                  by = c("ccode_a", "year")) %>%
        left_join(imp %>% select(ccode_b = ccode, year, polity2_b = polity2),
                  by = c("ccode_b", "year")) %>%
        mutate(polity2_min = pmin(polity2_a, polity2_b),
               polity2_max = pmax(polity2_a, polity2_b)) %>%
        select(-polity2_a, -polity2_b) %>%
        assert(not_na, everything()) %>%
        verify(polity2_min <= polity2_max)
}

## ----------------------------------------------------------------------
## Calculate distance from each participant state to each MID
## ----------------------------------------------------------------------

## Master list of states involved in each MID and first year of involvement
data_mid <- raw_mid_b %>%
    select(id = dispnum, ccode, year = styear) %>%
    group_by(id, ccode) %>%
    summarise(year = min(year)) %>%
    ungroup()

## Merge in state names and latitudes/longtitudes
##
## Also make corrections to conform with how things are written in the MIDLOC
## descriptions
data_state_year <- imp_state_year[[1]] %>%
    select(ccode, year, name = statenme, latitude, longitude) %>%
    mutate(name = recode(name,
                         `United States of America` = "United States|USA",
                         `Korea` = "South Korea",
                         `Austria-Hungary` = "AUH",
                         `Turkey` = "Turkey|Ottoman",
                         `Democratic Republic of the Congo` = "DRC",
                         `Central African Republic` = "Central Africa Republic",
                         `Philippines` = "Philippines|Phillippines"))
data_mid <- data_mid %>%
    left_join(data_state_year, by = c("ccode", "year"))

## Validate and clean the MID location data
data_mid_loc <- raw_mid_loc %>%
    select(id = dispnum,
           disp_lat = midloc2_ylatitude,
           disp_long = midloc2_xlongitude,
           note = midloc2_location) %>%
    assert(not_na, id) %>%
    verify(!duplicated(id)) %>%
    assert(within_bounds(-90, 90), disp_lat) %>%
    assert(within_bounds(-180, 180), disp_long) %>%
    filter(complete.cases(.))

## Merge in MID location data
data_mid <- data_mid %>%
    left_join(data_mid_loc, by = "id") %>%
    filter(complete.cases(.))

## When is a participant state named in the verbal description of the MID
## location?
data_mid <- data_mid %>%
    mutate(in_state = str_detect(note, name))

## Calculate distance to MID location, replacing it with zero when the MID
## occurs within the state (as calculated somewhat noisily above)
data_mid_distance <- data_mid %>%
    mutate(distance = distGeo(cbind(longitude, latitude),
                              cbind(disp_long, disp_lat)),
           distance = distance / 1609.344,
           distance = if_else(in_state, 0.0, distance)) %>%
    assert(not_na, everything()) %>%
    assert(within_bounds(0, Inf), distance) %>%
    select(id, ccode, distance)


## ----------------------------------------------------------------------
## Assemble final dispute- and state-level data
## ----------------------------------------------------------------------

## --- Case exclusions/data cleaning ---

## Confirm that states only participate on different sides of the same dispute
## in WWII (dispute number 258)
changing_sides <- raw_mid_b %>%
    group_by(dispnum, ccode) %>%
    summarise(n_sides = length(unique(sidea))) %>%
    filter(n_sides != 1)
stopifnot(all(changing_sides$dispnum == 258))

## Calculate how much time each state spent on each side in WWII
time_in_258 <- raw_mid_b %>%
    filter(dispnum == 258) %>%
    mutate(start = ymd(paste(styear, stmon, 1, sep = "-")),
           end = ymd(paste(endyear, endmon, 28, sep = "-")),
           time = as.numeric(end - start)) %>%
    group_by(ccode) %>%
    summarise(time_a = sum(time[sidea == 1]),
              time_b = sum(time[sidea == 0])) %>%
    verify(time_a != time_b)

## Code each state on whichever side they spent the most time on (this puts
## France and Russia with the Allies; Italy, Bulgaria, Romania, and Finland with
## the Axis)
side_a_258 <- with(time_in_258, ccode[time_a > time_b])
side_b_258 <- with(time_in_258, ccode[time_a < time_b])
stopifnot(length(intersect(side_a_258, side_b_258)) == 0)

## Exclude side switchers from the participant-level data and confirm now no
## duplicates remaining
data_mid_b <- raw_mid_b %>%
    filter(dispnum != 258 |
           (sidea == 1 & ccode %in% side_a_258) |
           (sidea == 0 & ccode %in% side_b_258))
data_mid_b %>%
    group_by(dispnum, ccode) %>%
    summarise(n_sides = length(unique(sidea))) %>%
    verify(n_sides == 1) %>%
    invisible()

## Exclude side switchers from the MID dyad data and confirm now no duplicates
## remaining
data_mid_dyad <- raw_mid_dyad %>%
    filter(!(dispnum == 258 & sidea1 == 1 & ccode1 %in% side_b_258)) %>%
    filter(!(dispnum == 258 & sidea1 == 0 & ccode1 %in% side_a_258)) %>%
    filter(!(dispnum == 258 & sidea2 == 1 & ccode2 %in% side_b_258)) %>%
    filter(!(dispnum == 258 & sidea2 == 0 & ccode2 %in% side_a_258))
data_mid_dyad %>%
    group_by(dispnum) %>%
    summarise(sidea = list(c(ccode1[sidea1 == 1], ccode2[sidea2 == 1])),
              sideb = list(c(ccode1[sidea1 == 0], ccode2[sidea2 == 0]))) %>%
    mutate(overlap = map2_dbl(sidea, sideb, function(x, y) length(intersect(x, y)))) %>%
    verify(overlap == 0) %>%
    invisible()

## Remove South Korea's involvement in the one weird USA-North Korea MID where
## there's no Side A calculated for South Korea, then confirm that this doesn't
## appear in the dyadic MID data
data_mid_b <- data_mid_b %>%
    verify(!is.na(sidea) | (dispnum == 4455 & ccode == 732)) %>%
    filter(!is.na(sidea))
data_mid_dyad %>%
    assert(in_set(0, 1, allow.na = FALSE), sidea1, sidea2) %>%
    verify(dispnum != 4455 | (ccode1 != 732 & ccode2 != 732)) %>%
    invisible()

## Calculate how many observations we lose when we drop those without locations
id_no_latlong <- setdiff(data_mid_b$dispnum, data_mid_distance$id)
## cat("\nNumber of MIDs dropped due to no location data:",
##     length(id_no_latlong), "of", length(unique(data_mid_b$dispnum)), "\n")

## Remove disputes without location information from the participant-level and
## dyadic data frames
data_mid_b <- data_mid_b %>%
    filter(dispnum %in% !! data_mid_distance$id)
data_mid_dyad <- data_mid_dyad %>%
    filter(dispnum %in% !! data_mid_distance$id)


## --- Dispute-level data ---

## Main metadata: unique dispute identifier, whether war occurred, winning side
## if so
##
## Alternative definition for victory: B wins whenever A doesn't (treating A as
## the initiator, B as the target)
##
## Dropping all disputes for which we don't have location data
data_dispute_meta <- raw_mid_a %>%
    select(dispnum3, hostlev, fatality, outcome) %>%
    assert(not_na, dispnum3, hostlev) %>%
    assert(in_set(1:5), hostlev) %>%
    assert(in_set(0:6), fatality) %>%
    filter(dispnum3 %in% !! data_mid_distance$id) %>%
    transmute(id = dispnum3,
              war = !is.na(fatality) & fatality >= 2,
              win_a = war & !is.na(outcome) & outcome %in% c(1, 4),
              win_b = war & !is.na(outcome) & outcome %in% c(2, 3),
              win_b_alt = war & !win_a) %>%
    verify(!any(win_a & win_b)) %>%
    verify(!any(win_a & win_b_alt)) %>%
    mutate_all(as.numeric) %>%
    assert(not_na, everything())

## Code dyadic variables for the first year the dyad was in the dispute together
stopifnot(identical(as.integer(sort(unique(data_dispute_meta$id))),
                    as.integer(sort(unique(data_mid_dyad$dispnum)))))

data_dispute_vars <- foreach (imp = imp_dyad_year) %do% {
    data_mid_dyad %>%
        verify(ccode1 < ccode2) %>%
        verify(sidea1 != sidea2) %>%
        select(id = dispnum,
               ccode_a = ccode1,
               ccode_b = ccode2,
               year) %>%
        group_by(id, ccode_a, ccode_b) %>%
        summarise(year = min(year)) %>%
        ungroup() %>%
        left_join(imp, by = c("ccode_a", "ccode_b", "year")) %>%
        assert(not_na, everything()) %>%
        verify(!duplicated(paste0(id, ".", ccode_a, ".", ccode_b))) %>%
        group_by(id) %>%
        summarise(s_nowt = mean(s_nowt),
                  s_cinc = mean(s_cinc),
                  contig = mean(contig),
                  py = mean(py),
                  py_alt = mean(py_alt),
                  mp_any = max(mp_any),
                  mp_all = max(mp_all),
                  polity2_min = min(polity2_min),
                  polity2_max = min(polity2_max)) %>%
        assert(not_na, everything()) %>%
        assert(within_bounds(-1, 1), s_nowt, s_cinc) %>%
        assert(within_bounds(0, 1), contig) %>%
        assert(within_bounds(0, Inf), py, py_alt) %>%
        verify(mp_all <= mp_any) %>%
        verify(polity2_min <= polity2_max)
}

## Put the metadata together with the covariates
data_dispute <- foreach (imp = data_dispute_vars) %do% {
    data_dispute_meta %>%
        left_join(imp, by = "id") %>%
        assert(not_na, everything()) %>%
        verify(!duplicated(id))
}


## --- State-level data ---

## Main metadata: what side each state was on in the dispute in question, plus
## year of first involvement to use for covariates
state_metadata <- data_mid_b %>%
    select(id = dispnum, ccode, sidea, year = styear) %>%
    group_by(id, ccode) %>%
    summarise(sidea = unique(sidea),
              year = min(year)) %>%
    ungroup() %>%
    assert(not_na, everything())

## Merge in distance to dispute
state_metadata <- state_metadata %>%
    left_join(data_mid_distance, by = c("id", "ccode")) %>%
    assert(not_na, everything())

## Merge with covariates in each imputation
data_participant <- foreach (imp = imp_state_year) %do% {
    state_metadata %>%
        left_join(imp, by = c("ccode", "year")) %>%
        select(id, ccode, stateabb, statenme, sidea, year, everything()) %>%
        assert(not_na, -ends_with("_lag_pure"))
}

## Calculate number of participants in each dispute, and include as a dispute
## level covariate
n_by_dispute <- state_metadata %>%
    group_by(id) %>%
    summarise(n_states = n())

data_dispute <- foreach (imp = data_dispute) %do% {
    imp %>%
        left_join(n_by_dispute, by = "id") %>%
        verify(!duplicated(id)) %>%
        assert(not_na, everything())
}

## Other additions to dispute-level data calculated from participant-level data
data_dispute <- foreach (i = seq_along(data_dispute)) %do% {
  data_participant[[i]] %>%
    group_by(id) %>%
    summarise(polity_a = mean(polity2[sidea == 1]),
              polity_b = mean(polity2[sidea == 0]),
              majpow_a = max(majpow[sidea == 1]),
              majpow_b = max(majpow[sidea == 0]),
              n_states_a = sum(sidea == 1),
              n_states_b = sum(sidea == 0)) %>%
    left_join(data_dispute[[i]], ., by = "id")
}

## Sanity check
stopifnot(identical(as.integer(sort(unique(data_dispute[[1]]$id))),
                    as.integer(sort(unique(data_participant[[1]]$id)))))


## --- State fixed effects ---

## Identify when states are always on the same side in every dispute
fe_list <- data_participant[[1]] %>%
    mutate(disp_side = paste(id, sidea, sep = "-")) %>%
    group_by(stateabb) %>%
    summarise(involvement = paste(disp_side, collapse = ",")) %>%
    group_by(involvement) %>%
    mutate(fe = paste(stateabb, collapse = ",")) %>%
    ungroup() %>%
    select(-involvement)

## ## View the ones that are always grouped together
## fe_list %>%
##     filter(nchar(fe) > 3) %>%
##     print(n = Inf)

## # A tibble: 8 x 2
##   stateabb fe
##   <chr>    <chr>
## 1 AAB      AAB,BAR,DMA,JAM,SLU,SVG
## 2 BAR      AAB,BAR,DMA,JAM,SLU,SVG
## 3 DMA      AAB,BAR,DMA,JAM,SLU,SVG
## 4 JAM      AAB,BAR,DMA,JAM,SLU,SVG
## 5 SLO      SLO,SLV
## 6 SLU      AAB,BAR,DMA,JAM,SLU,SVG
## 7 SLV      SLO,SLV
## 8 SVG      AAB,BAR,DMA,JAM,SLU,SVG

## Merge into state data, and calculate time varying fixed effects for major
## powers
make_time_varying <- c("USA", "UKG", "JPN", "GMY", "RUS", "CHN")
data_participant <- foreach (imp = data_participant) %do% {
    imp %>%
        left_join(fe_list, by = "stateabb") %>%
        rename(fe_1 = fe) %>%
        mutate(period = year,
               period = if_else(period < 1820, 1820, period),
               period = if_else(period > 1980, 1980, period),
               period = period - (period %% 20) + 10,
               fe_2 = if_else(stateabb %in% make_time_varying,
                              paste0(fe_1, period),
                              fe_1)) %>%
        select(-period) %>%
        select(id, ccode, stateabb, statenme, fe_1, fe_2, sidea, year, everything()) %>%
        assert(not_na, -ends_with("_lag_pure")) %>%
        verify(!duplicated(paste(id, ccode, sep = ".")))
}


## ----------------------------------------------------------------------
## Confirm that generated data is identical to analysis data
## ----------------------------------------------------------------------

## Confirm that two lists of data frames are identical
confirm_same <- function(list_orig, list_repl) {
  ## Same length
  stopifnot(length(list_orig) == length(list_repl))

  ## i: imputation
  for (i in seq_along(list_orig)) {
    imp_orig <- list_orig[[i]]
    imp_repl <- list_repl[[i]]

    ## Same number of observations and variables
    stopifnot(all(dim(imp_orig) == dim(imp_repl)))

    ## Same column names
    stopifnot(all(colnames(imp_orig) == colnames(imp_repl)))

    ## j: column
    for (j in seq_len(ncol(imp_orig))) {
      col_orig <- imp_orig[[j]]
      col_repl <- imp_repl[[j]]

      if (!is.numeric(col_orig)) {
        ## Exact same values for non-numerical variables
        stopifnot(all(col_orig == col_repl))
      } else {
        ## Numerically identical values for numerical variables
        stopifnot(all.equal(col_orig, col_repl, check.attributes = FALSE))
      }
    }
  }

  cat("\nImputed datasets are identical\n")
}

data_dispute_rep <- data_dispute
data_participant_rep <- data_participant
imp_state_year_rep <- imp_state_year
imp_dyad_year_rep <- imp_dyad_year
rm(data_dispute, data_participant, imp_state_year, imp_dyad_year)
save(data_dispute_rep, file = "generated_dispute.rda")
save(data_participant_rep, file = "generated_participant.rda")
save(imp_state_year_rep, file = "generated_state_year.rda")
save(imp_dyad_year_rep, file = "generated_dyad_year.rda")

load("kr_analysis_dispute.rda")
load("kr_analysis_participant.rda")
load("kr_state_year.rda")
load("kr_dyad_year.rda")

confirm_same(data_dispute, data_dispute_rep)
confirm_same(data_participant, data_participant_rep)
confirm_same(imp_state_year, imp_state_year_rep)
confirm_same(imp_dyad_year, imp_dyad_year_rep)


date()
